/**
 * Converts a PDF blob into a .docx document and hands the result to
 * add_file_output() under the same file name with a .docx extension.
 *
 * The blob is read with a FileReader, the docx library is loaded on demand,
 * the text content of every PDF page is extracted through PDFJS, and each
 * visual line of text becomes one docx paragraph.
 *
 * Failures (file read, script load, PDF parsing, docx packing) are reported
 * through console.error instead of being left as unhandled rejections.
 *
 * @param {Blob} blob - PDF data to convert.
 * @param {string} fileName - Original file name; a trailing ".pdf" is replaced by ".docx".
 */
function processFile(blob, fileName) {
    var reader = new FileReader();

    reader.onload = function (e) {
        PDFJS.workerSrc = '/js/pdf.worker.js';

        loadScriptPromise('https://cdn.jsdelivr.net/npm/docx@8.5.0/build/index.umd.js')
            .then(function () {
                return PDFJS.getDocument(new Uint8Array(e.target.result)).then(function (pdf) {
                    var pagePromises = [];
                    for (var i = 1; i <= pdf.numPages; i++) {
                        pagePromises.push(pdf.getPage(i).then(getPageTextContent));
                    }
                    return Promise.all(pagePromises);
                });
            })
            .then(function (pages) {
                var docParagraphs = [];
                pages.forEach(function (textContent, pageIndex) {
                    appendPageParagraphs(docParagraphs, textContent, pageIndex);
                });
                var doc = new docx.Document({ sections: [{ children: docParagraphs }] });
                return docx.Packer.toBlob(doc);
            })
            .then(function (outBlob) {
                add_file_output(URL.createObjectURL(outBlob), fileName.replace(/\.pdf$/i, '.docx'));
            })
            .catch(function (err) {
                console.error('processFile: failed to convert "' + fileName + '" to .docx', err);
            });
    };

    reader.onerror = function () {
        console.error('processFile: failed to read "' + fileName + '"', reader.error);
    };

    reader.readAsArrayBuffer(blob);
}

/** Returns the promise for a PDF page's text content. */
function getPageTextContent(page) {
    return page.getTextContent();
}

/**
 * Appends the docx paragraphs for one PDF page to docParagraphs.
 *
 * @param {Array} docParagraphs - Output list of docx.Paragraph objects (mutated).
 * @param {Object} textContent - Result of page.getTextContent(); its `items` are read.
 * @param {number} pageIndex - Zero-based page index; every page but the first
 *     is preceded by an empty paragraph with pageBreakBefore set.
 */
function appendPageParagraphs(docParagraphs, textContent, pageIndex) {
    if (pageIndex > 0) {
        docParagraphs.push(new docx.Paragraph({ pageBreakBefore: true, children: [] }));
    }

    // Drop whitespace-only items, then sort top-to-bottom, left-to-right.
    // Items whose y-positions differ by at most 2 are ordered by x instead.
    var items = textContent.items
        .filter(function (item) {
            return item.str && item.str.trim() !== '';
        })
        .sort(function (a, b) {
            var yDiff = b.transform[5] - a.transform[5];
            return Math.abs(yDiff) > 2 ? yDiff : a.transform[4] - b.transform[4];
        });

    // The page break above is still emitted for a page without text.
    if (items.length === 0) {
        return;
    }

    var lines = groupItemsIntoLines(items);

    // Average distance between consecutive lines; 14 is the fallback for a
    // page that has only one line.
    var gapTotal = 0;
    for (var g = 1; g < lines.length; g++) {
        gapTotal += Math.abs(lines[g - 1].y - lines[g].y);
    }
    var avgGap = lines.length > 1 ? gapTotal / (lines.length - 1) : 14;

    for (var j = 0; j < lines.length; j++) {
        var line = lines[j];

        // A gap clearly larger than the average line spacing is treated as a
        // paragraph break and rendered as extra spacing before the paragraph.
        var spacingBefore = 0;
        if (j > 0 && Math.abs(lines[j - 1].y - line.y) > avgGap * 1.4) {
            spacingBefore = 160;
        }

        docParagraphs.push(new docx.Paragraph({
            children: line.items.map(buildTextRun),
            spacing: spacingBefore ? { before: spacingBefore } : undefined
        }));
    }
}

/**
 * Groups sorted text items into lines by y-position.
 *
 * @param {Array} items - Non-empty list of text items, already sorted.
 * @returns {Array<{y: number, items: Array}>} Lines in input order. An item
 *     joins the current line when its y is within 2 of the y of the item that
 *     started that line.
 */
function groupItemsIntoLines(items) {
    var lines = [];
    var current = { y: items[0].transform[5], items: [items[0]] };

    for (var k = 1; k < items.length; k++) {
        var item = items[k];
        if (Math.abs(item.transform[5] - current.y) <= 2) {
            current.items.push(item);
        } else {
            lines.push(current);
            current = { y: item.transform[5], items: [item] };
        }
    }
    lines.push(current);

    return lines;
}

/**
 * Builds a docx.TextRun for one text item, deriving bold/italic from the
 * font name and the size from the item's height (falling back to
 * transform[0], then 12).
 */
function buildTextRun(item) {
    var fontName = item.fontName || '';
    var fontSize = Math.abs(item.height) || Math.abs(item.transform[0]) || 12;

    return new docx.TextRun({
        text: item.str,
        bold: /bold/i.test(fontName),
        italics: /italic|oblique/i.test(fontName),
        size: Math.max(12, Math.round(fontSize * 2)) // half-points
    });
}

// Cache of script-load promises, keyed by URL.
var _loadedScripts = {};

/**
 * Loads an external script once by appending a <script> element to
 * document.head.
 *
 * Repeated calls with the same URL return the same promise while the load is
 * pending or after it succeeded. A failed load is removed from the cache so a
 * later call can try again instead of receiving the stale rejection forever.
 *
 * @param {string} url - Script URL.
 * @returns {Promise} Resolves on the script's load event, rejects on its error event.
 */
function loadScriptPromise(url) {
    if (_loadedScripts[url]) {
        return _loadedScripts[url];
    }

    var pending = new Promise(function (resolve, reject) {
        var s = document.createElement('script');
        s.src = url;
        s.onload = resolve;
        s.onerror = reject;
        document.head.appendChild(s);
    });

    _loadedScripts[url] = pending;

    // Evict on failure. The caller still receives the rejection through the
    // returned promise; this handler only clears the cache entry.
    pending.catch(function () {
        if (_loadedScripts[url] === pending) {
            delete _loadedScripts[url];
        }
    });

    return pending;
}

/**
 * Replaces every match of `find` in `str` with `replace`.
 *
 * `find` is compiled as a regular-expression source (global flag), so regex
 * metacharacters in it are NOT treated literally.
 *
 * @param {string} find - Regular-expression source to search for.
 * @param {string} replace - Replacement text.
 * @param {string} str - Input string.
 * @returns {string} The string with all matches replaced.
 */
function replaceAll(find, replace, str) {
    return str.replace(new RegExp(find, 'g'), replace);
}

/**
 * Re-indents a brace/comma-delimited string (e.g. JSON).
 *
 * Outside double-quoted strings:
 *   - "{" is followed by a "\r" and the indentation of the new depth,
 *   - "," is followed by a "\r" and the indentation of the current depth,
 *   - "}" is preceded by a "\r" and the indentation of the reduced depth,
 *   - whitespace directly before and after those three characters is dropped.
 * Everything else, including all text inside double quotes, is copied as-is.
 *
 * @param {string} str - Text to format.
 * @returns {string} The formatted text.
 */
function beautify(str) {
    var result = '';
    var length = str.length;
    var depth = 0; // opening braces seen minus closing braces seen
    var withinQuotes = false;
    var i = 0;

    while (i < length) {
        var c = str[i];

        if (c === '"') {
            // A quote is escaped only when preceded by an odd number of
            // backslashes ("\\\"" ends with an escaped backslash, then a real quote).
            var backslashes = 0;
            for (var k = i - 1; k >= 0 && str[k] === '\\'; k--) {
                backslashes++;
            }
            if (backslashes % 2 === 0) {
                withinQuotes = !withinQuotes;
            }
        }

        if (!withinQuotes && (c === '{' || c === '}' || c === ',')) {
            // Remove whitespace and line breaks already emitted before this character.
            result = result.replace(/[ \r\n\t]+$/, '');

            if (c === '{') {
                depth++;
                result += c + '\r' + GetTabs(depth);
            } else if (c === '}') {
                depth--;
                result += '\r' + GetTabs(depth) + c;
            } else {
                result += c + '\r' + GetTabs(depth);
            }

            // Skip whitespace and line breaks that follow in the input.
            while (i + 1 < length &&
                    (str[i + 1] === ' ' || str[i + 1] === '\r' ||
                     str[i + 1] === '\n' || str[i + 1] === '\t')) {
                i++;
            }
        } else {
            result += c;
        }

        i++;
    }

    return result;
}

/**
 * Builds the indentation string for a nesting depth.
 *
 * NOTE(review): the name suggests tab characters, but the indent unit visible
 * in this file is a single space - confirm which one is intended.
 *
 * @param {number} count - Nesting depth; zero or negative yields ''.
 * @returns {string} `count` repetitions of the indent unit.
 */
function GetTabs(count) {
    var result = '';
    for (var i = 0; i < count; i++) {
        result += ' ';
    }
    return result;
}